R codes for GC2Inf article

Author

N. Montazeri and R. Mirmahdi

Published

November 10, 2024

Global settings

# Packages
require(readxl)
require(writexl)
require(tidyr)
require(dplyr)
require(AICcmodavg)
require(agricolae)
require(caret)
require(blandr)
require(splines)
require(ggplot2)
require(ggpubr)
library(extrafont)
require(cocor)
require (fitdistrplus)

# update.packages(ask = FALSE)
# font_import(prompt = FALSE) # import system fonts if needed.

# Set seed
set.seed(7910)

# Limit the number of significant digits to 3
options(digits=3)
## Turn in environments/packages/versions/etc ####
sessioninfo::session_info (pkgs = c("attached"))
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.4.2 (2024-10-31)
 os       macOS Sequoia 15.0.1
 system   x86_64, darwin20
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2024-11-10
 pandoc   3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/x86_64/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package      * version date (UTC) lib source
 agricolae    * 1.3-7   2023-10-22 [1] CRAN (R 4.4.0)
 AICcmodavg   * 2.3-3   2023-11-16 [1] CRAN (R 4.4.0)
 blandr       * 0.6.0   2024-06-09 [1] CRAN (R 4.4.0)
 caret        * 6.0-94  2023-03-21 [1] CRAN (R 4.4.0)
 cocor        * 1.1-4   2022-06-28 [1] CRAN (R 4.4.0)
 dplyr        * 1.1.4   2023-11-17 [1] CRAN (R 4.4.0)
 extrafont    * 0.19    2023-01-18 [1] CRAN (R 4.4.0)
 fitdistrplus * 1.2-1   2024-07-12 [1] CRAN (R 4.4.0)
 ggplot2      * 3.5.1   2024-04-23 [1] CRAN (R 4.4.0)
 ggpubr       * 0.6.0   2023-02-10 [1] CRAN (R 4.4.0)
 lattice      * 0.22-6  2024-03-20 [1] CRAN (R 4.4.2)
 MASS         * 7.3-61  2024-06-13 [1] CRAN (R 4.4.2)
 readxl       * 1.4.3   2023-07-06 [1] CRAN (R 4.4.0)
 survival     * 3.7-0   2024-06-05 [1] CRAN (R 4.4.2)
 tidyr        * 1.3.1   2024-01-24 [1] CRAN (R 4.4.0)
 writexl      * 1.5.1   2024-10-04 [1] CRAN (R 4.4.1)

 [1] /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library

──────────────────────────────────────────────────────────────────────────────

TCID50 vs. PFU

Data entry and initial cleanup

## TCID50: read original data ####
dat1 <- as.data.frame(read_excel("TCID50_plaque assay.xlsx"))
dat1$logTCID50ml <- as.numeric(dat1$logTCID50ml)
dat1 <- dat1[dat1$dil!= -6,] # At -5, the TCID50 already reached the LOD of 1.468 for 2 out of 3 replicates and no CPE was obsevred for -6 dilution.

# TCID50:PFU ratio
dat1$logtcid2pfu <- dat1$logTCID50ml-dat1$logpfuml
dat1$tcid2pfu <- (10^dat1$logTCID50ml)/(10^dat1$logpfuml)

# mean and std error of stock
##TCID50
mean(dat1$logTCID50ml[dat1$dil == 0], na.rm = TRUE)
[1] 5.81
sd(dat1$logTCID50ml[dat1$dil == 0], na.rm = TRUE)/sqrt(3)
[1] 0.175
str(dat1)
'data.frame':   18 obs. of  6 variables:
 $ dil        : num  0 0 0 -1 -1 -1 -2 -2 -2 -3 ...
 $ rep        : num  1 2 3 1 2 3 1 2 3 1 ...
 $ logpfuml   : num  6.7 6.7 6.7 5.7 5.7 5.7 4.7 4.7 4.7 3.7 ...
 $ logTCID50ml: num  5.47 5.9 6.05 4.98 4.8 ...
 $ logtcid2pfu: num  -1.232 -0.799 -0.649 -0.717 -0.899 ...
 $ tcid2pfu   : num  0.0586 0.1589 0.2244 0.1918 0.1262 ...
saveRDS(dat1, "TCID50_plaque assay.RDS")

Finding the best regression model fit

The data were processed to exclude the last dilution (-5 dilution) due to the fact that, at this point, the TCID50 values had already reached the limit of detection (LOD) of 1.468 in two out of three replicates.

# Linear model
dat1 <- readRDS("TCID50_plaque assay.RDS")

model_l <- lm(logTCID50ml ~ logpfuml, data = dat1)
summary(model_l)

Call:
lm(formula = logTCID50ml ~ logpfuml, data = dat1)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3361 -0.1876  0.0556  0.1659  0.3263 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.0543     0.1315   -0.41     0.69    
logpfuml      0.8625     0.0290   29.75    2e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.21 on 16 degrees of freedom
Multiple R-squared:  0.982, Adjusted R-squared:  0.981 
F-statistic:  885 on 1 and 16 DF,  p-value: 1.96e-15
# # Model diagnosis
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(model_l, las = 1); par(opar)

# Polynomial model
model_p <- lm(logTCID50ml ~ logpfuml + I(logpfuml^2), dat1)
summary(model_p)

Call:
lm(formula = logTCID50ml ~ logpfuml + I(logpfuml^2), data = dat1)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3809 -0.0917 -0.0055  0.1441  0.2338 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept)     0.4928     0.2925    1.68   0.1127   
logpfuml        0.5504     0.1545    3.56   0.0028 **
I(logpfuml^2)   0.0372     0.0181    2.05   0.0582 . 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.192 on 15 degrees of freedom
Multiple R-squared:  0.986, Adjusted R-squared:  0.984 
F-statistic:  533 on 2 and 15 DF,  p-value: 1.16e-14
# # Model diagnosis
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(model_p, las = 1); par(opar)

## Model Comparisons 
## RMSE
sqrt(mean(model_l$residuals^2))
[1] 0.198
sqrt(mean(model_p$residuals^2))
[1] 0.175
## AICc comparison
AICcmodavg::aictab (list (model_l=model_l, model_p=model_p),
                    second.ord = TRUE)

Model selection based on AICc:

        K  AICc Delta_AICc AICcWt Cum.Wt   LL
model_p 4 -0.58       0.00   0.63   0.63 5.83
model_l 3  0.51       1.09   0.37   1.00 3.60
# RMASE and model estimates of the best model fit
preferred_fit = model_l
sqrt(mean(preferred_fit$residuals^2))
[1] 0.198
coef(preferred_fit)
(Intercept)    logpfuml 
    -0.0543      0.8625 
confint(preferred_fit)
             2.5 % 97.5 %
(Intercept) -0.333  0.224
logpfuml     0.801  0.924
rm(model_l, model_p, preferred_fit)

Plotting

Predictions from data seq for plotting

# read data and model fits
dat1 <- readRDS("TCID50_plaque assay.RDS")

data_seq <- data.frame(logpfuml=seq(min(dat1$logpfuml), max(dat1$logpfuml), length.out = 50))

# selected model
model_l <- lm(logTCID50ml ~ logpfuml, data = dat1)
summary(model_l)

Call:
lm(formula = logTCID50ml ~ logpfuml, data = dat1)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3361 -0.1876  0.0556  0.1659  0.3263 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.0543     0.1315   -0.41     0.69    
logpfuml      0.8625     0.0290   29.75    2e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.21 on 16 degrees of freedom
Multiple R-squared:  0.982, Adjusted R-squared:  0.981 
F-statistic:  885 on 1 and 16 DF,  p-value: 1.96e-15
# Predictions
## conditional means
ci_fit1 <- predict(model_l, 
                   newdata = data_seq,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr = lwr,
        ci_upr = upr,
        ci_fit = fit)

## Predicted values
pred_fit1 <- predict(model_l,
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr = lwr,
        pred_upr = upr,
        pred_fit = fit)

## Consolidate Predicted values and conditional means
new_data <- cbind(logpfuml=data_seq$logpfuml, pred_fit1, ci_fit1)

saveRDS(new_data, "pred_tcid50_from_pfu_data_seq.RDS")

rm(data_seq, model_l, dat1, pred_fit1,ci_fit1, new_data)

Plot

#| eval: true
#| echo: true
#| output: true

dat1 <- readRDS("TCID50_plaque assay.RDS")
dat2 <- readRDS("pred_tcid50_from_pfu_data_seq.RDS")

p1 <- ggplot () + 
 geom_point(data=dat1, 
            aes(x=logpfuml, y=logTCID50ml), 
            color = "black", shape = 20, size=1.2) +
 stat_smooth (data=dat1, 
              aes(x=logpfuml, y=logTCID50ml),
              color = c("#00529b"), linewidth=0.5, method="lm", se = FALSE, 
              formula = y~x) +
 geom_ribbon(data = dat2,
             aes(x= logpfuml, ymin = ci_lwr, ymax = ci_upr), 
             alpha = 0.2, fill = "gray10") + 
  geom_line(data = dat2, 
            aes(x = logpfuml, y = pred_lwr), 
            linetype = "longdash", color = "gray15", linewidth=0.1) +
  geom_line(data = dat2, 
            aes(x = logpfuml, y = pred_upr), 
            linetype = "longdash", color = "gray15", linewidth=0.1) +
  scale_x_continuous (breaks = c (2, 3, 4, 5, 6, 7),
                     limits = c(1.5,7)) +
 scale_y_continuous (breaks = c (1,2,3,4,5,6,7,8),
                     limits = c(0.5, 7.5)) +
 labs (x = bquote (Log[10]~PFU~per~ml), size = 3,
       y= bquote (Log[10]~TCID[50]~per~ml), size = 3) +
 theme_bw (base_family="") +
 theme(
  text = element_text(family = "Times"),
  legend.position = "bottom", 
  legend.direction="horizontal", 
  legend.text=element_text(size = 6, face='bold'),
  legend.title=element_text (size = 6,face='bold'),
  plot.title = element_text(color="black", size=10, face = "bold", hjust = 0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color = "none")

# this plot will be combined with the TCID50:PFU Beta distribution plot

TCID50:PFU ratio

# reading data
dat1 <- readRDS("TCID50_plaque assay.RDS")

# Average and std error
mean(dat1$logtcid2pfu)
[1] -0.632
sd(dat1$logtcid2pfu)/sqrt(length(dat1$logtcid2pfu))
[1] 0.0745
mean(dat1$tcid2pfu)
[1] 0.303
sd(dat1$tcid2pfu)/sqrt(length(dat1$tcid2pfu))
[1] 0.0574
# Whether the log tcid50:pfu is different across dilutions
dat1$logpfuml <- as.factor(dat1$logpfuml)
aov1 <- aov(tcid2pfu ~ logpfuml, data = dat1);summary(aov1)
            Df Sum Sq Mean Sq F value Pr(>F)   
logpfuml     5  0.787  0.1574    8.59 0.0012 **
Residuals   12  0.220  0.0183                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise comparison by separating w/ and w/o rnase. pfu_rxn has to be a factor
pairwise.t.test(dat1$tcid2pfu, dat1$logpfuml, 
                p.adjust.method = "bonf", 
                alternative = c("two.sided"))

    Pairwise comparisons using t tests with pooled SD 

data:  dat1$tcid2pfu and dat1$logpfuml 

    1.7   2.7   3.7   4.7   5.7  
2.7 0.633 -     -     -     -    
3.7 0.005 0.320 -     -     -    
4.7 0.008 0.498 1.000 -     -    
5.7 0.005 0.320 1.000 1.000 -    
6.7 0.004 0.249 1.000 1.000 1.000

P value adjustment method: bonferroni 
tukey.test <- agricolae::HSD.test(
  aov1, "logpfuml", group=TRUE, alpha = 0.05); tukey.test
$statistics
  MSerror Df  Mean   CV   MSD
   0.0183 12 0.303 44.6 0.371

$parameters
   test   name.t ntr StudentizedRange alpha
  Tukey logpfuml   6             4.75  0.05

$means
    tcid2pfu    std r     se    Min   Max   Q25   Q50   Q75
1.7    0.706 0.2082 3 0.0782 0.5857 0.946 0.586 0.586 0.766
2.7    0.455 0.2276 3 0.0782 0.1918 0.586 0.389 0.586 0.586
3.7    0.162 0.0332 3 0.0782 0.1262 0.192 0.147 0.168 0.180
4.7    0.189 0.0750 3 0.0782 0.1262 0.272 0.147 0.168 0.220
5.7    0.162 0.0332 3 0.0782 0.1262 0.192 0.147 0.168 0.180
6.7    0.147 0.0835 3 0.0782 0.0586 0.224 0.109 0.159 0.192

$comparison
NULL

$groups
    tcid2pfu groups
1.7    0.706      a
2.7    0.455     ab
4.7    0.189      b
5.7    0.162      b
3.7    0.162      b
6.7    0.147      b

attr(,"class")
[1] "group"
# Boxplot of log tcid2pfu:PFU
p <- ggplot(dat1, aes(x = factor(logpfuml), y = tcid2pfu)) +
 geom_boxplot(fill = "grey80", color = "black") +
 labs(x = "log10 PFU per Reaction", 
      y = "logarithmic TCID50:PFU ratio") +
 theme_bw()
p

# Removing two more low concentrations
dat1 <- dat1[!dat1$dil %in% c(-5,-4), ]

# Average and std error
mean(dat1$logtcid2pfu)
[1] -0.808
sd(dat1$logtcid2pfu)/sqrt(length(dat1$logtcid2pfu))
[1] 0.0484
mean(dat1$tcid2pfu)
[1] 0.165
sd(dat1$tcid2pfu)/sqrt(length(dat1$tcid2pfu))
[1] 0.0156
# Whether the log tcid50:pfu is different across dilutions
dat1$logpfuml <- as.factor(dat1$logpfuml)
aov1 <- aov(tcid2pfu ~ logpfuml, data = dat1);summary(aov1)
            Df  Sum Sq Mean Sq F value Pr(>F)
logpfuml     3 0.00269  0.0009    0.24   0.86
Residuals    8 0.02961  0.0037               
# Pairwise comparison by separating w/ and w/o rnase. pfu_rxn has to be a factor
pairwise.t.test(dat1$tcid2pfu, dat1$logpfuml, 
                p.adjust.method = "bonf", 
                alternative = c("two.sided"))

    Pairwise comparisons using t tests with pooled SD 

data:  dat1$tcid2pfu and dat1$logpfuml 

    3.7 4.7 5.7
4.7 1   -   -  
5.7 1   1   -  
6.7 1   1   1  

P value adjustment method: bonferroni 
tukey.test <- agricolae::HSD.test(aov1, "logpfuml", group=T, alpha = 0.05); tukey.test
$statistics
  MSerror Df  Mean   CV   MSD
   0.0037  8 0.165 36.9 0.159

$parameters
   test   name.t ntr StudentizedRange alpha
  Tukey logpfuml   4             4.53  0.05

$means
    tcid2pfu    std r     se    Min   Max   Q25   Q50   Q75
3.7    0.162 0.0332 3 0.0351 0.1262 0.192 0.147 0.168 0.180
4.7    0.189 0.0750 3 0.0351 0.1262 0.272 0.147 0.168 0.220
5.7    0.162 0.0332 3 0.0351 0.1262 0.192 0.147 0.168 0.180
6.7    0.147 0.0835 3 0.0351 0.0586 0.224 0.109 0.159 0.192

$comparison
NULL

$groups
    tcid2pfu groups
4.7    0.189      a
5.7    0.162      a
3.7    0.162      a
6.7    0.147      a

attr(,"class")
[1] "group"

Predicting TCID50:PFU over PFU (obs)

Predict log PFU/ml using TCID50:PFU so that we can compare predictions with observed log PFU/ml.

# Read data
dat1 <- readRDS("TCID50_plaque assay.RDS")

# Fitting the global model
fit1 <- lm(logpfuml ~ tcid2pfu, data = dat1); summary(fit1)

Call:
lm(formula = logpfuml ~ tcid2pfu, data = dat1)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.1062 -0.9678  0.0342  0.8619  2.0708 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    5.847      0.458   12.77  8.3e-10 ***
tcid2pfu      -5.428      1.190   -4.56  0.00032 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.19 on 16 degrees of freedom
Multiple R-squared:  0.565, Adjusted R-squared:  0.538 
F-statistic: 20.8 on 1 and 16 DF,  p-value: 0.000321
sqrt(mean(fit1$residuals^2))
[1] 1.13
coef(fit1)
(Intercept)    tcid2pfu 
       5.85       -5.43 
confint(fit1)
            2.5 % 97.5 %
(Intercept)  4.88   6.82
tcid2pfu    -7.95  -2.91
# Predictions
## Predicted values
pred_fit1 <- predict(fit1,
                     newdata = dat1,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr = lwr,
        pred_upr = upr,
        pred_fit = fit)

## conditional means
ci_fit1 <- predict(fit1, 
                   newdata = dat1,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr = lwr,
        ci_upr = upr,
        ci_fit = fit)

## Consolidate Predicted values and conditional means
merged_data <- cbind(data.frame(logpfuml=dat1$logpfuml), pred_fit1, ci_fit1)
merged_data$dif_PFU_Pred_Obs <- merged_data$pred_fit-merged_data$logpfuml

saveRDS(merged_data, "pred_pfu_from_tcid2pfu.RDS")

rm(dat1, pred_fit1,ci_fit1, merged_data)

Agreement analyses and visualizations

To assess the use of TCID50:PFU for predicting log PFU/ml and compare with observed log PFU/ml.

dat1 <- readRDS("pred_pfu_from_tcid2pfu.RDS")

# Bland-Altman analysis (Tukey mean-difference)
blandr.statistics (dat1$pred_fit, dat1$logpfuml, sig.level=0.95)
Bland-Altman Statistics
=======================
t = 5e-15, df = 17, p-value = 1
alternative hypothesis: true bias is not equal to 0

=======================
Number of comparisons:  18 
Maximum value for average measures:  6.11 
Minimum value for average measures:  1.21 
Maximum value for difference in measures:  2.11 
Minimum value for difference in measures:  -2.07 

Bias:  1.42e-15 
Standard deviation of bias:  1.16 

Standard error of bias:  0.273 
Standard error for limits of agreement:  0.476 

Bias:  1.42e-15 
Bias- upper 95% CI:  0.576 
Bias- lower 95% CI:  -0.576 

Upper limit of agreement:  2.27 
Upper LOA- upper 95% CI:  3.27 
Upper LOA- lower 95% CI:  1.27 

Lower limit of agreement:  -2.27 
Lower LOA- upper 95% CI:  -1.27 
Lower LOA- lower 95% CI:  -3.27 

=======================
Derived measures:  
Mean of differences/means:  1.57 
Point estimate of bias as proportion of lowest average:  1.18e-13 
Point estimate of bias as proportion of highest average 2.32e-14 
Spread of data between lower and upper LoAs:  4.54 
Bias as proportion of LoA spread:  3.12e-14 

=======================
Bias: 
 1.42e-15  ( -0.576  to  0.576 ) 
ULoA: 
 2.27  ( 1.27  to  3.27 ) 
LLoA: 
 -2.27  ( -3.27  to  -1.27 ) 
# Difference between W/O over W/ RNase treatment
mean(dat1$dif_PFU_Pred_Obs)
[1] 1.42e-15
sd(dat1$dif_PFU_Pred_Obs)/sqrt(length(dat1))
[1] 0.41
mean(10^dat1$dif_PFU_Pred_Obs)
[1] 11.8
sd(10^dat1$dif_PFU_Pred_Obs)/sqrt(length(dat1))
[1] 10.6
## check the assumption of normality for the differences
shapiro.test(dat1$dif_PFU_Pred_Obs)

    Shapiro-Wilk normality test

data:  dat1$dif_PFU_Pred_Obs
W = 1, p-value = 0.9
## t-test
t.test(
 dat1$pred_fit, dat1$logpfuml, 
 alternative = c("two.sided"),
 paired = TRUE)

    Paired t-test

data:  dat1$pred_fit and dat1$logpfuml
t = 5e-15, df = 17, p-value = 1
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -0.576  0.576
sample estimates:
mean difference 
       1.42e-15 

Beta distribution

iters=1000
## reading data
dat1 <- readRDS("TCID50_plaque assay.RDS")

# scaled value (min-max normalization)
min_logtcid2pfu <- min(dat1$logtcid2pfu)
max_logtcid2pfu <- max(dat1$logtcid2pfu)

dat1$logtcid2pfu_s <- (dat1$logtcid2pfu - min_logtcid2pfu) / (max_logtcid2pfu - min_logtcid2pfu)

dat1$logtcid2pfu_s[dat1$logtcid2pfu_s == 0] <- 0.00001
dat1$logtcid2pfu_s[dat1$logtcid2pfu_s == 1] <- 0.99999

# Beta dist fit: W/ RNase
fit_beta <- fitdist(dat1$logtcid2pfu_s, "beta", 
                    start = list(shape1 = 2, shape2 = 2),
                    method = 'mge'); summary (fit_beta)
Warning in fitdist(dat1$logtcid2pfu_s, "beta", start = list(shape1 = 2, :
maximum GOF estimation has a default 'gof' argument set to 'CvM'
Fitting of the distribution ' beta ' by maximum goodness-of-fit 
Parameters : 
       estimate
shape1     1.70
shape2     1.88
Loglikelihood:  -12.5   AIC:  29.1   BIC:  30.9 
# Bootstrap simulation of uncertainty
f_beta_boot <- bootdist(fit_beta, bootmethod="param", niter = iters)
summary(f_beta_boot)
Parametric bootstrap medians and 95% percentile CI 
       Median  2.5% 97.5%
shape1   1.85 0.974  4.47
shape2   2.02 1.045  5.31
# parameters of Beta distribution (scaled)

shape1 <- quantile(f_beta_boot$estim[, 1], probs = c(0.5))
shape2 <- quantile(f_beta_boot$estim[, 2], probs = c(0.5))

rbeta.est.boot <- rbeta (iters, shape1, shape2)

# Plotting density distributions
data_seq <- seq(0, 1, length = 100)
dbeta2 <- dbeta (data_seq, shape1, shape2)

# Create a data frame for the beta distribution to overlay
dat1_beta <- data.frame(x = data_seq, density = dbeta2)

scale_back <- function(x) {
  x * (max_logtcid2pfu - min_logtcid2pfu) + min_logtcid2pfu
}

x_ticks_scaled <- seq(0, 1, length.out = 6)
x_ticks_actual <- scale_back(x_ticks_scaled)

# Summaries
beta_sum <- quantile(rbeta.est.boot, prob=c(0.025, 0.50, 0.975))
(beta_sum_actual <- scale_back (beta_sum))
  2.5%    50%  97.5% 
-1.147 -0.646 -0.159 
est_0.5 <- beta_sum[2]
est_0.5_actual <- beta_sum_actual[2]

p2 <- ggplot(data=dat1, aes(x = logtcid2pfu_s)) +
 geom_histogram(aes(y = after_stat(density)), bins = 15, 
                fill = "darkgray", color = "black") +
 geom_line(data=dat1_beta, 
           aes(x = x, y = density),
           color = "#fb6502", linetype = "dashed", linewidth = 0.5) +
 geom_rug(aes(x = logtcid2pfu_s), sides = "b", color = "black") +
 geom_vline(xintercept = est_0.5,
             color = "#00529b", linetype = "solid", linewidth = 0.5) +
 annotate("text", x = est_0.5,
          y = Inf, label = paste("Median =", round(est_0.5_actual, 2)),
          vjust = 5, hjust = -0.2,
          color = "#00529b", size = 2.5,
          family = "times") + 
 scale_x_continuous(breaks = x_ticks_scaled, 
                    labels = round(x_ticks_actual, 1)) +
 scale_y_continuous (breaks = c (0, 1, 2, 3, 4, 5),
                    limits = c(0, 5)) +
 labs(
  x = bquote (Log[10]~TCID[50]:PFU~ratio),
  y = "Density") +
 theme_bw (base_family="") +
 theme( 
  text = element_text(family = "Times"),
  legend.position = "inside",
  legend.position.inside = c(0.3, 0.9), 
  legend.direction="horizontal",
  legend.text=element_text(size = 6, face='bold'),
  legend.title=element_text (size = 6,face='bold'),
  plot.title = element_text(color="black", size=10, face = "bold", hjust = 0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color = "none")

combined_plot <- ggpubr::ggarrange(p1, #from TCID50 over PFU
                                   p2, ncol = 2,
                                   labels = c("A", "B"),
                                   font.label = list(size=9, face="bold", family="times"
                                   ))
combined_plot

ggsave(plot=combined_plot, file="TuV_gc_pfu_and_beta_dist.tiff", width=18, height=7, units = c("cm"))

rm(dat1, min_logtcid2pfu, max_logtcid2pfu, iters, fit_beta, shape1, shape2, f_beta_boot, dbeta2, data_seq, dat1_beta, rbeta.est.boot, est_0.5, est_0.5_actual, x_ticks_actual, x_ticks_scaled, scale_back, p1, p2)

Model validation through train data (using original data)

# Read data
dat1 <- readRDS("TCID50_plaque assay.RDS")

# Initialize a list to store the RMSE results
rmse_results <- list()
r2_results <- list()
train_data_list <- list()
test_data_list <- list()

index <- createDataPartition(dat1$logpfuml, 
                             p = 0.70, times = 5, list = TRUE)

# Loop over each partition and check for data size
for (i in 1:length(index)) {
  train_data <- dat1[index[[i]], ]
  test_data <- dat1[-index[[i]], ]
  
  # Check if both train and test sets have more than one row
  if (nrow(train_data) > 1 & nrow(test_data) > 1) {
    # Train the model on train_data
    model <- train(logpfuml ~ tcid2pfu, 
                   data = train_data, 
                   method = "lm", 
                   trControl = trainControl(method = "boot", number = 500))
    
    # Predict on test data
    pred <- predict(model, newdata = test_data)
    
    # Calculate RMSE
    rmse <- caret::RMSE(pred = pred, obs = test_data$logpfuml)
    r2 <- caret::R2(pred = pred, obs = test_data$logpfuml)    
    
    # Store the result
    rmse_results[[i]] <- rmse
    r2_results[[i]] <- r2
    train_data_list[[i]] <- train_data
    test_data_list[[i]] <- test_data 
    
  } else {
    # If either train or test set is too small, skip this iteration
    cat("Skipping iteration", i, "due to small data size\n")
  }
}
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
# Check if any valid RMSE results were obtained
rmse_results; summary(unlist(rmse_results))
[[1]]
[1] 0.889

[[2]]
[1] 0.252

[[3]]
[1] 1.43

[[4]]
[1] 0.872

[[5]]
[1] 1.6
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.252   0.872   0.889   1.007   1.429   1.595 
r2_results; summary(unlist(r2_results))
[[1]]
[1] 1

[[2]]
[1] 1

[[3]]
[1] 1

[[4]]
[1] 1

[[5]]
[1] 1
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 
train_data_list[1]
[[1]]
   dil rep logpfuml logTCID50ml logtcid2pfu tcid2pfu
1    0   1      6.7        5.47      -1.232   0.0586
2    0   2      6.7        5.90      -0.799   0.1589
3    0   3      6.7        6.05      -0.649   0.2244
4   -1   1      5.7        4.98      -0.717   0.1918
6   -1   3      5.7        4.93      -0.774   0.1683
7   -2   1      4.7        3.80      -0.899   0.1262
8   -2   2      4.7        4.13      -0.566   0.2719
9   -2   3      4.7        3.93      -0.774   0.1683
10  -3   1      3.7        2.93      -0.774   0.1683
11  -3   2      3.7        2.80      -0.899   0.1262
12  -3   3      3.7        2.98      -0.717   0.1918
13  -4   1      2.7        2.47      -0.232   0.5861
14  -4   2      2.7        2.47      -0.232   0.5861
15  -4   3      2.7        1.98      -0.717   0.1918
17  -5   2      1.7        1.47      -0.232   0.5857
18  -5   3      1.7        1.68      -0.024   0.9463
test_data_list
[[1]]
   dil rep logpfuml logTCID50ml logtcid2pfu tcid2pfu
5   -1   2      5.7        4.80      -0.899    0.126
16  -5   1      1.7        1.47      -0.232    0.586

[[2]]
   dil rep logpfuml logTCID50ml logtcid2pfu tcid2pfu
8   -2   2      4.7        4.13      -0.566    0.272
13  -4   1      2.7        2.47      -0.232    0.586

[[3]]
   dil rep logpfuml logTCID50ml logtcid2pfu tcid2pfu
7   -2   1      4.7        3.80      -0.899    0.126
18  -5   3      1.7        1.68      -0.024    0.946

[[4]]
   dil rep logpfuml logTCID50ml logtcid2pfu tcid2pfu
7   -2   1      4.7        3.80      -0.899    0.126
16  -5   1      1.7        1.47      -0.232    0.586

[[5]]
   dil rep logpfuml logTCID50ml logtcid2pfu tcid2pfu
8   -2   2      4.7        4.13      -0.566    0.272
15  -4   3      2.7        1.98      -0.717    0.192
rm(dat1, r2_results, rmse_results, test_data, train_data)

RT-qPCR calibration curve

Data entry and initial cleanup

dat1 <- as.data.frame(read_excel("tv_std_curve.xlsx"))

# Remove rows with any NA values
dat1 <- dat1[complete.cases(dat1), ]
dat1$ct <- as.numeric(dat1$ct)
dat1 <- dat1[, c("ct", "log_gc_rxn")]

saveRDS (dat1, "tv_std_curve.RDS")

rm(dat1)

Regression and outlier assessment

Comparing model fits (w/ or w/o Ct>40)

dat1 <- readRDS ("tv_std_curve.RDS")
dat1 <- dat1[complete.cases(dat1), ]

# With Ct higher than 40 (row 59)
# Fit the linear model
fit_w40 <- lm(ct ~ log_gc_rxn, 
           data = dat1, na.action=na.omit)
summary(fit_w40)

Call:
lm(formula = ct ~ log_gc_rxn, data = dat1, na.action = na.omit)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1569 -0.3694 -0.0106  0.2567  1.9609 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  48.6046     0.2474   196.5   <2e-16 ***
log_gc_rxn   -3.8479     0.0415   -92.6   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.576 on 55 degrees of freedom
Multiple R-squared:  0.994, Adjusted R-squared:  0.994 
F-statistic: 8.58e+03 on 1 and 55 DF,  p-value: <2e-16
# Model diagnosis
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit_w40, las = 1); par(opar)

# Remove Ct higher than 40 (row 59)
dat2 <- readRDS ("tv_std_curve.RDS")
dat2$ct[dat2$ct>40] <- NA
dat2 <- dat2[complete.cases(dat2), ]

# Model without outlier
fit_wo40 <- lm (ct ~ log_gc_rxn, 
            data=dat2, na.action=na.omit)
summary(fit_wo40)

Call:
lm(formula = ct ~ log_gc_rxn, data = dat2, na.action = na.omit)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1082 -0.3672  0.0466  0.3218  1.0392 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   48.361      0.228     212   <2e-16 ***
log_gc_rxn    -3.811      0.038    -100   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.511 on 54 degrees of freedom
Multiple R-squared:  0.995, Adjusted R-squared:  0.995 
F-statistic: 1.01e+04 on 1 and 54 DF,  p-value: <2e-16
# Model diagnosis of second model
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit_wo40, las = 1); par(opar)

## RMSE
sqrt(mean((dat1$ct - predict.lm(fit_w40))^2))
[1] 0.566
sqrt(mean((dat2$ct  - predict.lm(fit_wo40))^2))
[1] 0.502
## AICc comparison
AICcmodavg::aictab (list (fit_w40=fit_w40, fit_wo40=fit_wo40), 
                    second.ord = TRUE)

Model selection based on AICc:

         K  AICc Delta_AICc AICcWt Cum.Wt    LL
fit_wo40 3  88.2        0.0      1      1 -40.9
fit_w40  3 103.3       15.2      0      1 -48.4
# RMASE and model estimates of the best model fit
## selected fit (without including ct > 40)
preferred_fit = fit_wo40
sqrt(mean(preferred_fit$residuals^2))
[1] 0.502
coef(preferred_fit)
(Intercept)  log_gc_rxn 
      48.36       -3.81 
confint(preferred_fit)
            2.5 % 97.5 %
(Intercept) 47.90  48.82
log_gc_rxn  -3.89  -3.74
rm(dat1, dat2, fit_w40, fit_wo40, preferred_fit)

Ct 16-40 predictions of GC w/ and w/o removing Ct>40

Reverse models for predictions

#| eval: true
#| echo: true
#| output: true

dat1 <- readRDS ("tv_std_curve.RDS")
dat1 <- dat1[complete.cases(dat1), ]

# With Ct higher than 40 (row 59)
# Fit the linear model
fit_w40 <- lm(log_gc_rxn ~ ct, 
           data = dat1, na.action=na.omit)

# Remove Ct higher than 40 (row 59)
dat2 <- readRDS ("tv_std_curve.RDS")
dat2$ct[dat2$ct>40] <- NA
dat2 <- dat2[complete.cases(dat2), ]

# Model without outlier
fit_wo40 <- lm (log_gc_rxn ~ ct, 
            data=dat2, na.action=na.omit)
summary(fit_wo40)

Call:
lm(formula = log_gc_rxn ~ ct, data = dat2, na.action = na.omit)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.29120 -0.09454  0.00495  0.07799  0.27799 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  12.6514     0.0713     177   <2e-16 ***
ct           -0.2610     0.0026    -100   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.134 on 54 degrees of freedom
Multiple R-squared:  0.995, Adjusted R-squared:  0.995 
F-statistic: 1.01e+04 on 1 and 54 DF,  p-value: <2e-16
# Save models to use them for predictions
list (fit_w40=fit_w40, fit_w40s=summary(fit_w40),
      fit_wo40=fit_wo40, fit_wo40s=summary(fit_wo40)) |>
 saveRDS("pcr_model_fits.RDS")

Prediction calculations

# Calling model fits and data
model_fits <- readRDS("pcr_model_fits.RDS")

dat1 <- readRDS ("tv_std_curve.RDS")
dat1 <- dat1[complete.cases(dat1), ]

# Create a seq of numbers roughly within the limit of cal curve
data_seq <- data.frame(ct=seq(16, 40, length.out = 50))

# Predictions of GC with ct > 40
## conditional means
ci_fit_w <- predict(model_fits$fit_w40, 
                   newdata = data_seq,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr_w = lwr,
        ci_upr_w = upr,
        ci_fit_w = fit)

## Predicted values
pred_fit_w <- predict(model_fits$fit_w40,
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_w = lwr,
        pred_upr_w = upr,
        pred_fit_w = fit)

# Predictions of GC without ct > 40
## conditional means
ci_fit_wo <- predict(model_fits$fit_wo40, 
                   newdata = data_seq,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr_wo = lwr,
        ci_upr_wo = upr,
        ci_fit_wo = fit)

## Predicted values
pred_fit_wo <- predict(model_fits$fit_wo40, 
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_wo = lwr,
        pred_upr_wo = upr,
        pred_fit_wo = fit)

## Consolidate Predicted values and conditional means
merged_data <- cbind(data_seq, pred_fit_w, ci_fit_w, pred_fit_wo, ci_fit_wo)

# Difference between conditional mean and predicted values and between redicted values
merged_data$dif_pred_fit_wo_w <- merged_data$pred_fit_wo - merged_data$pred_fit_w

saveRDS(merged_data, "pred_gc_Ct16to40.RDS")

rm(merged_data, dat1, pred_fit_w, pred_fit_wo, ci_fit_w, ci_fit_wo)

Models agreement analyses and visualizations

# Read data
dat1 <- readRDS("pred_gc_Ct16to40.RDS")

# Bland-Altman analysis
blandr.statistics (dat1$pred_fit_wo, dat1$pred_fit_w, sig.level=0.95)
Bland-Altman Statistics
=======================
t = -5, df = 49, p-value = 3e-05
alternative hypothesis: true bias is not equal to 0

=======================
Number of comparisons:  50 
Maximum value for average measures:  8.47 
Minimum value for average measures:  2.24 
Maximum value for difference in measures:  0.0203 
Minimum value for difference in measures:  -0.0456 

Bias:  -0.0127 
Standard deviation of bias:  0.0196 

Standard error of bias:  0.00278 
Standard error for limits of agreement:  0.00477 

Bias:  -0.0127 
Bias- upper 95% CI:  -0.00707 
Bias- lower 95% CI:  -0.0182 

Upper limit of agreement:  0.0258 
Upper LOA- upper 95% CI:  0.0354 
Upper LOA- lower 95% CI:  0.0162 

Lower limit of agreement:  -0.0511 
Lower LOA- upper 95% CI:  -0.0415 
Lower LOA- lower 95% CI:  -0.0607 

=======================
Derived measures:  
Mean of differences/means:  -0.432 
Point estimate of bias as proportion of lowest average:  -0.566 
Point estimate of bias as proportion of highest average -0.149 
Spread of data between lower and upper LoAs:  0.0769 
Bias as proportion of LoA spread:  -16.4 

=======================
Bias: 
 -0.0127  ( -0.0182  to  -0.00707 ) 
ULoA: 
 0.0258  ( 0.0162  to  0.0354 ) 
LLoA: 
 -0.0511  ( -0.0607  to  -0.0415 ) 
# Main difference and std dev of difference: Prediction
shapiro.test(dat1$dif_pred_fit_wo_w)

    Shapiro-Wilk normality test

data:  dat1$dif_pred_fit_wo_w
W = 1, p-value = 0.06
t.test(
 dat1$pred_fit_wo, dat1$pred_fit_w, 
 alternative = c("two.sided"),
 paired = TRUE)

    Paired t-test

data:  dat1$pred_fit_wo and dat1$pred_fit_w
t = -5, df = 49, p-value = 3e-05
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -0.01823 -0.00707
sample estimates:
mean difference 
        -0.0127 
mean(dat1$pred_fit_wo)
[1] 5.34
mean(dat1$pred_fit_w)
[1] 5.36
mean(dat1$dif_pred_fit_wo_w)
[1] -0.0127
sd(dat1$dif_pred_fit_wo_w)/sqrt(length(dat1))
[1] 0.00524
rm(dat1)

Ct 40-50 predictions of GC w/ and w/o removing Ct>40

Prediction calculations

# Calling model fits
model_fits <- readRDS("pcr_model_fits.RDS")

# calling model fits
dat1 <- readRDS ("tv_std_curve.RDS")
dat1 <- dat1[complete.cases(dat1), ]

# Create a seq of numbers roughly within the limit of cal curve
data_seq <- data.frame(ct=seq(40, 50, length.out = 50))

# Predictions of GC with ct > 40
## conditional means
ci_fit_w <- predict(model_fits$fit_w40, 
                   newdata = data_seq,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr_w = lwr,
        ci_upr_w = upr,
        ci_fit_w = fit)

## Predicted values
pred_fit_w <- predict(model_fits$fit_w40,
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_w = lwr,
        pred_upr_w = upr,
        pred_fit_w = fit)

# Predictions of GC without ct > 40
## conditional means
ci_fit_wo <- predict(model_fits$fit_wo40, 
                   newdata = data_seq,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr_wo = lwr,
        ci_upr_wo = upr,
        ci_fit_wo = fit)

## Predicted values
pred_fit_wo <- predict(model_fits$fit_wo40, 
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_wo = lwr,
        pred_upr_wo = upr,
        pred_fit_wo = fit)

# Merge dataframes
merged_data <- cbind(data_seq, pred_fit_w, ci_fit_w, pred_fit_wo, ci_fit_wo)

# Difference between conditional mean and predicted values and between redicted values
merged_data$dif_pred_fit_wo_w <- merged_data$pred_fit_wo - merged_data$pred_fit_w

saveRDS(merged_data, "pred_gc_Ct40to50.RDS")

rm(merged_data, dat1, pred_fit_w, pred_fit_wo, ci_fit_w, ci_fit_wo)

Models agreement analyses and visualizations

# Read data
dat1 <- readRDS("pred_gc_Ct40to50.RDS")

# Bland-Altman analysis (Tukey mean-difference)
# Conditional means based on different cal curves
blandr.statistics (dat1$pred_fit_wo, dat1$pred_fit_w, sig.level=0.95)
Bland-Altman Statistics
=======================
t = -50, df = 49, p-value = <2e-16
alternative hypothesis: true bias is not equal to 0

=======================
Number of comparisons:  50 
Maximum value for average measures:  2.24 
Minimum value for average measures:  -0.361 
Maximum value for difference in measures:  -0.0456 
Minimum value for difference in measures:  -0.0731 

Bias:  -0.0594 
Standard deviation of bias:  0.00818 

Standard error of bias:  0.00116 
Standard error for limits of agreement:  0.00199 

Bias:  -0.0594 
Bias- upper 95% CI:  -0.057 
Bias- lower 95% CI:  -0.0617 

Upper limit of agreement:  -0.0433 
Upper LOA- upper 95% CI:  -0.0393 
Upper LOA- lower 95% CI:  -0.0473 

Lower limit of agreement:  -0.0754 
Lower LOA- upper 95% CI:  -0.0714 
Lower LOA- lower 95% CI:  -0.0794 

=======================
Derived measures:  
Mean of differences/means:  -15.7 
Point estimate of bias as proportion of lowest average:  16.5 
Point estimate of bias as proportion of highest average -2.66 
Spread of data between lower and upper LoAs:  0.0321 
Bias as proportion of LoA spread:  -185 

=======================
Bias: 
 -0.0594  ( -0.0617  to  -0.057 ) 
ULoA: 
 -0.0433  ( -0.0473  to  -0.0393 ) 
LLoA: 
 -0.0754  ( -0.0794  to  -0.0714 ) 
# Main difference and std dev of difference: Prediction
shapiro.test(dat1$dif_pred_fit_wo_w)

    Shapiro-Wilk normality test

data:  dat1$dif_pred_fit_wo_w
W = 1, p-value = 0.06
t.test(
 dat1$pred_fit_wo, dat1$pred_fit_w, 
 alternative = c("two.sided"),
 paired = TRUE)

    Paired t-test

data:  dat1$pred_fit_wo and dat1$pred_fit_w
t = -51, df = 49, p-value <2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -0.0617 -0.0570
sample estimates:
mean difference 
        -0.0594 
mean(dat1$pred_fit_wo)
[1] 0.908
mean(dat1$pred_fit_w)
[1] 0.967
mean(dat1$dif_pred_fit_wo_w)
[1] -0.0594
sd(dat1$dif_pred_fit_wo_w)/sqrt(length(dat1))
[1] 0.00219
rm(dat1)

Plot of RT-qPCR calibration curve

Generate predictions and intervals

# calling data and model fits after emoving Ct>40
dat1 <- readRDS ("tv_std_curve.RDS")
dat1$ct[dat1$ct>40] <- NA
dat1 <- dat1[complete.cases(dat1), ] 

fit0 <- lm(ct ~ log_gc_rxn, data = dat1)

# Create a sequence of values for logpfuml for prediction
data_seq <- data.frame(log_gc_rxn=seq(min(dat1$log_gc_rxn), max(dat1$log_gc_rxn), length.out = 20))

pred_fit <- predict(fit0, 
                    newdata = data_seq,
                   interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr = lwr,
        pred_upr = upr,
        pred_fit = fit)

ci_fit <- predict(fit0, newdata = data_seq,
                   interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr = lwr,
        ci_upr = upr,
        ci_fit = fit)

merged_data <- cbind(data_seq, pred_fit, ci_fit)

saveRDS(merged_data, "RT-qPCR_cal_curve_predicitons.RDS")

rm(dat1, fit0, pred_fit, ci_fit, merged_data)

Plot

# calling data and model fits after emoving Ct>40
dat1 <- readRDS ("tv_std_curve.RDS")
dat1$ct[dat1$ct>40] <- NA
dat1 <- dat1[complete.cases(dat1), ] 
dat2 <- readRDS("RT-qPCR_cal_curve_predicitons.RDS")

p <- ggplot () + 
geom_point (data=dat1, 
            aes(x=log_gc_rxn, y=ct),
            color = "black", shape = 16, size=1.2) +
 stat_smooth (data=dat1, 
              aes(x=log_gc_rxn, y=ct),
              color = c("#00529b"), linewidth=0.5, method="lm", se = FALSE, 
              formula = y~x) +
 geom_ribbon(data = dat2,
             aes(x=log_gc_rxn, ymin = ci_lwr, ymax = ci_upr), 
             alpha = 0.2, fill = "gray10") + 
 geom_line(data = dat2, 
           aes(x=log_gc_rxn, y = pred_lwr), 
           linetype = "longdash", color = "gray15", linewidth=0.1) +
 geom_line(data = dat2, 
           aes(x = log_gc_rxn, y = pred_upr), 
           linetype = "longdash", color = "gray15", linewidth=0.1) +
 scale_x_continuous (breaks = c (2, 3, 4, 5, 6, 7, 8, 9),
                     limits = c(2, 8.5)) +
 scale_y_continuous (breaks = c (15, 20, 25, 30, 35, 40, 45),
                     limits = c(15, 45)) +
 labs (x = bquote (Log[10]~Genomic~Copies~Per~Reaction), size = 3,
       y= bquote (Cycle~Threshold~(Ct)), size = 3) +
 theme_bw (base_family="") +
 theme(
  text = element_text(family = "Times"),
  legend.position = "bottom", 
  legend.direction="horizontal", 
  legend.text=element_text(size = 6, face = "bold"),
  legend.title=element_text (size = 6, face = "bold"),
  plot.title = element_text(color="black", size=10, face = "bold", hjust = 0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color = "none")
 
 p

ggsave(plot=p, file="tv_cal_curve_wo outlier.pdf", width=8, height=5, units = c("cm"))

GC over PFU

Data entry and initial cleanup

dat1 <- as.data.frame(read_excel("pfu_ct_022124.xlsx"))
dat1$dil <- as.factor(dat1$dil)
dat1$rnase <- as.factor(dat1$rnase)
dat1$pfu_rxn <- as.numeric(dat1$pfu_rxn)
dat1$ct <- as.numeric(dat1$ct)
Warning: NAs introduced by coercion
# Ct values less than 10 or higher than 40 (it  belogs to -1 log PFU/ml==> let's keep)
dat1 |>
 filter(ct>40)
  dil rnase   ct pfu_rxn rep
1   6     w 41.3    -1.2   1
dat1$ct[dat1$ct>40] <- NA
# "1   6     w 41.26563    -1.2   1" was removed.

dat1 |>
 filter(ct<=10)
[1] dil     rnase   ct      pfu_rxn rep    
<0 rows> (or 0-length row.names)
dat1$ct[dat1$ct<=10] <- NA # to replace with NA. No datapoint was removed.

saveRDS(dat1, "pfu_ct_022124_cleanedup.RDS")
rm(dat1)

Calculate GC and averaging tech reps

# Calling model fits and data
model_fits <- readRDS("pcr_model_fits.RDS")
dat1 <- readRDS ("pfu_ct_022124_cleanedup.RDS")

## Predicted values
pred_fit <- predict(model_fits$fit_wo40,
                     newdata = dat1,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr = lwr,
        pred_upr = upr,
        pred_fit = fit)

ci_fit <- predict(model_fits$fit_wo40,
                     newdata = dat1,
                     interval = "confidence", level = 0.95) |>
 as.data.frame() |> 
 rename(ci_lwr = lwr,
        ci_upr = upr,
        ci_fit = fit)

merged_data <- cbind(dat1, pred_fit, ci_fit)

str(merged_data)
'data.frame':   84 obs. of  11 variables:
 $ dil     : Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ rnase   : Factor w/ 2 levels "w","wo": 1 1 2 2 1 1 2 2 1 1 ...
 $ ct      : num  NA 16.4 16.5 16.2 21.2 ...
 $ pfu_rxn : num  4.8 4.8 4.8 4.8 3.8 3.8 3.8 3.8 2.8 2.8 ...
 $ rep     : num  1 1 1 1 1 1 1 1 1 1 ...
 $ pred_fit: num  NA 8.37 8.36 8.41 7.12 ...
 $ pred_lwr: num  NA 8.09 8.08 8.14 6.85 ...
 $ pred_upr: num  NA 8.64 8.63 8.69 7.39 ...
 $ ci_fit  : num  NA 8.37 8.36 8.41 7.12 ...
 $ ci_lwr  : num  NA 8.3 8.29 8.35 7.08 ...
 $ ci_upr  : num  NA 8.43 8.42 8.48 7.17 ...
dat1 <- merged_data[complete.cases(merged_data), ] |>
 relocate(ct, .after=rep)

# saving data as Excel file
write_xlsx(dat1, "pred_GCofPFU_from_ct.xlsx")

# saving data as RDS file
saveRDS(dat1, "pred_GCofPFU_from_ct.RDS")

rm(dat1)

# Averaging tech reps
dat1 <- readRDS("pred_GCofPFU_from_ct.RDS") |>
 group_by(dil, pfu_rxn, rnase, rep) |>
 summarise(across(pred_fit:ci_upr, \(x) mean (x, na.rm= TRUE))) |>
 as.data.frame()
`summarise()` has grouped output by 'dil', 'pfu_rxn', 'rnase'. You can override
using the `.groups` argument.
# Calculate GC:PFU ratio 
dat1$loggc2pfu <- dat1$pred_fit-dat1$pfu_rxn
dat1$gc2pfu <- (10^dat1$pred_fit)/(10^dat1$pfu_rxn)

saveRDS (dat1, "pred_GCfromPFU_ave.RDS")
write_xlsx (dat1, "pred_GCfromPFU_ave.xlsx")

rm(dat1, merged_data, pred_fit, ci_fit)

GC over PFU Regression W/ and W/O RNase

# Read data
dat1_w <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="w") |>
 as.data.frame()

dat1_wo <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="wo") |>
 as.data.frame()

## model fit w/ RNase
fit_w <- lm (pred_fit ~ pfu_rxn, 
            data=dat1_w, na.action=na.omit)
summary(fit_w)

Call:
lm(formula = pred_fit ~ pfu_rxn, data = dat1_w, na.action = na.omit)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.3481 -0.0512  0.0591  0.1395  0.2460 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.8407     0.0535    71.7   <2e-16 ***
pfu_rxn       0.9307     0.0199    46.8   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.182 on 19 degrees of freedom
Multiple R-squared:  0.991, Adjusted R-squared:  0.991 
F-statistic: 2.19e+03 on 1 and 19 DF,  p-value: <2e-16
## Model diagnosis w/ RNase
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit_w, las = 1); par(opar)

## model fit w/o RNase
fit_wo <- lm (pred_fit ~ pfu_rxn, 
            data=dat1_wo, na.action=na.omit)
summary(fit_wo)

Call:
lm(formula = pred_fit ~ pfu_rxn, data = dat1_wo, na.action = na.omit)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.4018 -0.1054 -0.0347  0.1941  0.3978 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   4.0683     0.0638    63.8   <2e-16 ***
pfu_rxn       0.9284     0.0237    39.2   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.217 on 19 degrees of freedom
Multiple R-squared:  0.988, Adjusted R-squared:  0.987 
F-statistic: 1.53e+03 on 1 and 19 DF,  p-value: <2e-16
## Model diagnosis w/o RNase
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(fit_wo, las = 1); par(opar)

# Summary of coefficients and 95% CI
coef(fit_w)
(Intercept)     pfu_rxn 
      3.841       0.931 
confint(fit_w)
            2.5 % 97.5 %
(Intercept) 3.729  3.953
pfu_rxn     0.889  0.972
coef(fit_wo)
(Intercept)     pfu_rxn 
      4.068       0.928 
confint(fit_wo)
            2.5 % 97.5 %
(Intercept) 3.935  4.202
pfu_rxn     0.879  0.978
## Saving model fits
list(fit_w=fit_w, fit_ws=summary(fit_w), 
     fit_wo=fit_wo, fit_wos=summary(fit_wo)) |>
 saveRDS("gc_pfu_model_fits.RDS")

Correlation b/w GC and PFU

dat1_w <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="w") |>
 as.data.frame()

dat1_wo <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="wo") |>
 as.data.frame()

# Correlations bw pfu_rxn and gc_rxn ####
cor_w <- cor.test (dat1_w$pfu_rxn, dat1_w$pred_fit, 
                   method = c("pearson")); cor_w

    Pearson's product-moment correlation

data:  dat1_w$pfu_rxn and dat1_w$pred_fit
t = 47, df = 19, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.989 0.998
sample estimates:
  cor 
0.996 
cor_wo <- cor.test (dat1_wo$pfu_rxn, dat1_wo$pred_fit, 
                   method = c("pearson")); cor_wo

    Pearson's product-moment correlation

data:  dat1_wo$pfu_rxn and dat1_wo$pred_fit
t = 39, df = 19, p-value <2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.985 0.998
sample estimates:
  cor 
0.994 
# Fisher's z to compare the two correlations
r1.jk <- cor_w$estimate
n1 <- nrow(dat1_w)
r2.hm <- cor_wo$estimate
n2 <- nrow(dat1_wo)

cocor::cocor.indep.groups(
 r1.jk, r2.hm, n1, n2,
 alpha = 0.05,
 alternative = "two.sided",
 test = "all"
)

  Results of a comparison of two correlations based on independent groups

Comparison between r1.jk = 0.996 and r2.hm = 0.994
Difference: r1.jk - r2.hm = 0.0018
Group sizes: n1 = 21, n2 = 21
Null hypothesis: r1.jk is equal to r2.hm
Alternative hypothesis: r1.jk is not equal to r2.hm (two-sided)
Alpha: 0.05

fisher1925: Fisher's z (1925)
  z = 0.5291, p-value = 0.5968
  Null hypothesis retained

zou2007: Zou's (2007) confidence interval
  95% confidence interval for r1.jk - r2.hm: -0.0057 0.0114
  Null hypothesis retained (Interval includes 0)
rm(dat1_w, dat1_wo, cor_w, cor_wo, r1.jk, n1, r2.hm, n2)

GC over PFU Predictions

# Predictions of GC
## data, model fits, and generate seq
dat1 <- readRDS("pred_GCfromPFU_ave.RDS")
model_fits <- readRDS("gc_pfu_model_fits.RDS")

data_seq <- data.frame(pfu_rxn=seq(min(dat1$pfu_rxn), max(dat1$pfu_rxn), length.out = 50))

## W/ RNase
pred_fit_w <- predict(model_fits$fit_w, 
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_w = lwr,
        pred_upr_w = upr,
        pred_fit_w = fit)

ci_fit_w <- predict(model_fits$fit_w, 
                     newdata = data_seq,
                     interval = "confidence", level = 0.95)  |>
 as.data.frame() |> 
 rename(ci_lwr_w = lwr,
        ci_upr_w = upr,
        ci_fit_w = fit)

## W/o RNase
pred_fit_wo <- predict(model_fits$fit_wo, 
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_wo = lwr,
        pred_upr_wo = upr,
        pred_fit_wo = fit)

ci_fit_wo <- predict(model_fits$fit_wo, 
                     newdata = data_seq,
                     interval = "confidence", level = 0.95)  |>
 as.data.frame() |> 
 rename(ci_lwr_wo = lwr,
        ci_upr_wo = upr,
        ci_fit_wo = fit)

merged_data <- cbind(data_seq, pred_fit_w, ci_fit_w, pred_fit_wo, ci_fit_wo)

merged_data$pred_dif_wo_w <- merged_data$pred_fit_wo - merged_data$pred_fit_w

saveRDS(merged_data, "GCvsPFU_prediciton.RDS")

rm(dat1, data_seq, pred_fit_w, pred_fit_wo, merged_data)

Models agreement analyses and visualizations

# Bland-Altman analysis (Tukey mean-difference)
dat1 <- readRDS("GCvsPFU_prediciton.RDS")

blandr.statistics (dat1$pred_fit_wo, dat1$pred_fit_w, sig.level=0.95)
Bland-Altman Statistics
=======================
t = 400, df = 49, p-value = <2e-16
alternative hypothesis: true bias is not equal to 0

=======================
Number of comparisons:  50 
Maximum value for average measures:  8.42 
Minimum value for average measures:  2.84 
Maximum value for difference in measures:  0.23 
Minimum value for difference in measures:  0.216 

Bias:  0.223 
Standard deviation of bias:  0.00419 

Standard error of bias:  0.000592 
Standard error for limits of agreement:  0.00102 

Bias:  0.223 
Bias- upper 95% CI:  0.225 
Bias- lower 95% CI:  0.222 

Upper limit of agreement:  0.232 
Upper LOA- upper 95% CI:  0.234 
Upper LOA- lower 95% CI:  0.23 

Lower limit of agreement:  0.215 
Lower LOA- upper 95% CI:  0.217 
Lower LOA- lower 95% CI:  0.213 

=======================
Derived measures:  
Mean of differences/means:  4.4 
Point estimate of bias as proportion of lowest average:  7.87 
Point estimate of bias as proportion of highest average 2.65 
Spread of data between lower and upper LoAs:  0.0164 
Bias as proportion of LoA spread:  1362 

=======================
Bias: 
 0.223  ( 0.222  to  0.225 ) 
ULoA: 
 0.232  ( 0.23  to  0.234 ) 
LLoA: 
 0.215  ( 0.213  to  0.217 ) 
# Main difference and std dev of difference: Confidence
## check the assumption of normality for the differences
shapiro.test(dat1$pred_dif_wo_w)

    Shapiro-Wilk normality test

data:  dat1$pred_dif_wo_w
W = 1, p-value = 0.06
## t-test
t.test(
 dat1$pred_fit_wo, dat1$pred_fit_w, 
 alternative = c("two.sided"),
 paired = TRUE)

    Paired t-test

data:  dat1$pred_fit_wo and dat1$pred_fit_w
t = 377, df = 49, p-value <2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 0.222 0.225
sample estimates:
mean difference 
          0.223 
# Difference between W/O over W/ RNase treatment
dat1 <- readRDS("GCvsPFU_prediciton.RDS")
mean(dat1$pred_dif_wo_w)
[1] 0.223
sd(dat1$pred_dif_wo_w)/sqrt(length(dat1))
[1] 0.00112
mean(10^dat1$pred_dif_wo_w)
[1] 1.67
sd(10^dat1$pred_dif_wo_w)/sqrt(length(dat1))
[1] 0.00431
# Whether the w/o - w/ differ among different concentrations
aov1 <- lm(pred_dif_wo_w ~ pfu_rxn, data = dat1); summary(aov1)

Call:
lm(formula = pred_dif_wo_w ~ pfu_rxn, data = dat1)

Residuals:
      Min        1Q    Median        3Q       Max 
-8.97e-16 -2.67e-16 -4.52e-17  2.06e-16  9.16e-16 

Coefficients:
             Estimate Std. Error   t value Pr(>|t|)    
(Intercept)  2.28e-01   8.10e-17  2.81e+15   <2e-16 ***
pfu_rxn     -2.34e-03   3.21e-17 -7.30e+13   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.01e-16 on 48 degrees of freedom
Multiple R-squared:     1,  Adjusted R-squared:     1 
F-statistic: 5.34e+27 on 1 and 48 DF,  p-value: <2e-16
rm(dat1, aov1)

Plot of GC over PFU

dat1 <- readRDS("pred_GCfromPFU_ave.RDS")
dat1 <- dat1[complete.cases(dat1), ]

dat2 <- readRDS("GCvsPFU_prediciton.RDS")

p_w <- ggplot () + 
geom_point (data=dat1 |> filter(rnase == "w"), 
            aes(x=pfu_rxn, y=pred_fit),
            color = "black", shape = 16, size=1.2) +
 stat_smooth (data=dat1 |> filter(rnase == "w"), 
              aes(x=pfu_rxn, y=pred_fit),
              color = c("#00529b"), linewidth=0.5, method="lm", se = FALSE, 
              formula = y~x) +
 geom_ribbon(data = dat2,
             aes(x=pfu_rxn, ymin = ci_lwr_w, ymax = ci_upr_w), 
             alpha = 0.2, fill = "gray30") + 
 geom_line(data = dat2, 
           aes(x = pfu_rxn, y = pred_lwr_w), 
           linetype = "longdash", color = "gray15", linewidth=0.1) +
 geom_line(data = dat2, 
           aes(x = pfu_rxn, y = pred_upr_w), 
           linetype = "longdash", color = "gray15", linewidth=0.1) +
  scale_x_continuous (breaks = c (-1, 0, 1, 2, 3, 4, 5),
                     limits = c(-1.2, 5)) +
 scale_y_continuous (breaks = c (2, 3, 4, 5, 6, 7, 8, 9),
                     limits = c(2, 9.1)) +
 labs (title = "With RNase",
       x = bquote (Log[10]~PFU~per~reaction), size = 3,
       y= bquote (Log[10]~GC~per~reaction), size = 3) +
 theme_bw (base_family="") +
 theme( 
  text = element_text(family = "Times"),
  legend.position = "inside",
  legend.position.inside = c(0.3, 0.9), 
  legend.direction="horizontal", 
  legend.text=element_text(size = 6, face='bold'),
  legend.title=element_text (size = 6,face='bold'),
  plot.title = element_text(color="black", size=10, face="bold", hjust = 0.5, vjust=0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.box.background=element_rect(fill="transparent", color="transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  legend.key=element_rect(fill="transparent", color="transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color=guide_legend(override.aes=list(fill=NA)))
 
p_wo <- ggplot () + 
geom_point (data=dat1 |> filter(rnase == "wo"), 
            aes(x=pfu_rxn, y=pred_fit),
            color = "black", shape = 16, size=1.2) +
 stat_smooth (data=dat1 |> filter(rnase == "wo"), 
              aes(x=pfu_rxn, y=pred_fit),
              color = c("#00529b"), linewidth=0.5, method="lm", se = FALSE, 
              formula = y~x) +
 geom_ribbon(data = dat2,
             aes(x= pfu_rxn, ymin = ci_lwr_wo, ymax = ci_upr_wo), 
             alpha = 0.2, fill = "gray30") + 
 geom_line(data = dat2, 
           aes(x = pfu_rxn, y = pred_lwr_wo), 
           linetype = "longdash", color = "gray15", linewidth=0.1) +
 geom_line(data = dat2, 
           aes(x = pfu_rxn, y = pred_upr_wo), 
           linetype = "longdash", color = "gray15", linewidth=0.1) +
 scale_x_continuous (breaks = c (-1, 0, 1, 2, 3, 4, 5),
                     limits = c(-1.2, 5)) +
 scale_y_continuous (breaks = c (2, 3, 4, 5, 6, 7, 8, 9),
                     limits = c(2, 9.1)) +
 labs (title = "Without RNase", 
       x = bquote (Log[10]~PFU~per~reaction), size = 3,
       y ="") +
 theme_bw (base_family="") +
 theme( 
  text = element_text(family = "Times"),
  legend.position = "inside",
  legend.position.inside = c(0.3, 0.9), 
  legend.direction="horizontal", 
  legend.text=element_text(size = 6, face='bold'),
  legend.title=element_text (size = 6,face='bold'),
  plot.title = element_text(color="black", size=10, face="bold", hjust = 0.5, vjust=0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.box.background=element_rect(fill="transparent", color="transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  legend.key=element_rect(fill="transparent", color="transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color=guide_legend(override.aes=list(fill=NA)))

combined_plot <- ggpubr::ggarrange(p_w, p_wo, 
                                   ncol = 2,
                                   labels = c("A", "B"),
                                   font.label = list(size=9, face="bold", family="times"
                                   ))
combined_plot

ggsave(plot=combined_plot, file="TuV_gc_pfu.tiff", width=18, height=7, units = c("cm"))

rm(dat1, dat2, p_w, p_wo)

GC:PFU ratio

GC:PFU for w/ vs. w/o RNase

# reading data for w/ and w/o RNase
dat1 <- readRDS("pred_GCfromPFU_ave.RDS")

# reading data for w/ and w/o RNase
dat1_w <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="w") |>
 as.data.frame()

dat1_wo <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="wo") |>
 as.data.frame()

# Average and std error
## W/ Rnase
mean(dat1_w$loggc2pfu)
[1] 3.72
sd(dat1_w$loggc2pfu)/sqrt(length(dat1_w))
[1] 0.0657
mean(dat1_w$gc2pfu)
[1] 5864
sd(dat1_w$gc2pfu)/sqrt(length(dat1_w))
[1] 832
## W/o Rnase
mean(dat1_wo$loggc2pfu)
[1] 3.94
sd(dat1_wo$loggc2pfu)/sqrt(length(dat1_wo))
[1] 0.0744
mean(dat1_wo$gc2pfu)
[1] 10525
sd(dat1_wo$gc2pfu)/sqrt(length(dat1_wo))
[1] 2259
# Whether the GC:PFU different for w/o vs w/ RNase
dat1 <- readRDS("pred_GCfromPFU_ave.RDS")
dat1$pfu_rxn1 <- as.factor(dat1$pfu_rxn)
aov1 <- aov(loggc2pfu ~ rnase*pfu_rxn1, data = dat1);summary(aov1)
               Df Sum Sq Mean Sq F value Pr(>F)   
rnase           1  0.524   0.524   10.47 0.0031 **
pfu_rxn1        6  0.897   0.150    2.99 0.0220 * 
rnase:pfu_rxn1  6  0.064   0.011    0.21 0.9694   
Residuals      28  1.401   0.050                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise comparison by separating w/ and w/o rnase. pfu_rxn has to be a factor
## W/ RNase
aov1 <- aov(loggc2pfu ~ pfu_rxn, data = dat1_w); summary(aov1)
            Df Sum Sq Mean Sq F value Pr(>F)   
pfu_rxn      1  0.403   0.403    12.1 0.0025 **
Residuals   19  0.632   0.033                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(dat1_w$loggc2pfu, dat1_w$pfu_rxn, 
                p.adjust.method = "bonf", 
                alternative = c("two.sided"))

    Pairwise comparisons using t tests with pooled SD 

data:  dat1_w$loggc2pfu and dat1_w$pfu_rxn 

     -1.2 -0.2 0.8 1.8 2.8 3.8
-0.2 1.0  -    -   -   -   -  
0.8  1.0  1.0  -   -   -   -  
1.8  1.0  1.0  1.0 -   -   -  
2.8  1.0  1.0  1.0 1.0 -   -  
3.8  1.0  1.0  1.0 1.0 1.0 -  
4.8  0.5  0.3  1.0 1.0 1.0 1.0

P value adjustment method: bonferroni 
# W/O RNase
aov1 <- aov(loggc2pfu ~ pfu_rxn, data = dat1_wo); summary(aov1)
            Df Sum Sq Mean Sq F value Pr(>F)   
pfu_rxn      1  0.431   0.431    9.14  0.007 **
Residuals   19  0.896   0.047                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(dat1_wo$loggc2pfu, dat1_wo$pfu_rxn, 
                p.adjust.method = "bonf", 
                alternative = c("two.sided"))

    Pairwise comparisons using t tests with pooled SD 

data:  dat1_wo$loggc2pfu and dat1_wo$pfu_rxn 

     -1.2 -0.2 0.8 1.8 2.8 3.8
-0.2 1.0  -    -   -   -   -  
0.8  1.0  1.0  -   -   -   -  
1.8  1.0  1.0  1.0 -   -   -  
2.8  1.0  1.0  1.0 1.0 -   -  
3.8  0.8  1.0  1.0 1.0 1.0 -  
4.8  0.8  1.0  1.0 1.0 1.0 1.0

P value adjustment method: bonferroni 
# Boxplot of GC:PFU across RNase
p <- ggplot(dat1, aes(x = factor(pfu_rxn), y = loggc2pfu)) +
  geom_boxplot(fill = "grey80", color = "black") +
  labs(x = "log10 PFU per Reaction", 
       y = "logarithmic GC:PFU ratio",
       title = "GC:PFU ratio w/ and w/o RNase combined") +
  facet_wrap(~ rnase) +  # Separate the plots by the rnase variable
  theme_bw()

p

ggsave(plot=p, filename="loggc2pfu_W_WO_rnase.pdf", width=12, height=7, units = c("cm"))

rm(dat1, dat1_w, dat1_wo, p, aov1)

Beta distributions

iters=1000
# W/ RNase
## reading data for w/ and w/o RNase
dat1_w <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="w") |>
 as.data.frame()

# scaled value (min-max normalization)
min_loggc2pfu <- min(dat1_w$loggc2pfu)
max_loggc2pfu <- max(dat1_w$loggc2pfu)

dat1_w$loggc2pfu_s <- (dat1_w$loggc2pfu - min_loggc2pfu) / (max_loggc2pfu - min_loggc2pfu)

# change absolute 0 and 1 to avoid error on modeling
dat1_w$loggc2pfu_s[dat1_w$loggc2pfu_s == 0] <- 0.00001
dat1_w$loggc2pfu_s[dat1_w$loggc2pfu_s == 1] <- 0.99999

# Beta dist fit: W/ RNase
fit_beta <- fitdist(dat1_w$loggc2pfu_s, "beta", 
                    start = list(shape1 = 2, shape2 = 2),
                    method = 'mge'); summary (fit_beta)
Warning in fitdist(dat1_w$loggc2pfu_s, "beta", start = list(shape1 = 2, :
maximum GOF estimation has a default 'gof' argument set to 'CvM'
Fitting of the distribution ' beta ' by maximum goodness-of-fit 
Parameters : 
       estimate
shape1     3.72
shape2     2.58
Loglikelihood:  -36.9   AIC:  77.7   BIC:  79.8 
# Bootstrap simulation of uncertainty
f_beta_boot <- bootdist(fit_beta, bootmethod="param", niter = iters)
summary(f_beta_boot)
Parametric bootstrap medians and 95% percentile CI 
       Median 2.5% 97.5%
shape1   3.97 2.04  9.45
shape2   2.70 1.43  6.49

The estimation method converged only for 999 among 1000 iterations 
# parameters of Beta distribution (scaled)
shape1_w <- quantile(f_beta_boot$estim[, 1], probs = c(0.5))
shape2_w <- quantile(f_beta_boot$estim[, 2], probs = c(0.5))

rbeta.est.boot_w <- rbeta (iters, shape1_w, shape2_w)

# Plotting density distributions
data_seq <- seq(0, 1, length = 100)
dbeta2 <- dbeta (data_seq, shape1_w, shape2_w)

# Create a data frame for the beta distribution to overlay
dat1_beta <- data.frame(x = data_seq, density = dbeta2)

scale_back <- function(x) {
  x * (max_loggc2pfu - min_loggc2pfu) + min_loggc2pfu
}

x_ticks_scaled <- seq(0, 1, length.out = 6)
x_ticks_actual <- scale_back (x_ticks_scaled)

# Summaries
beta_sum <- quantile(rbeta.est.boot_w, probs = c(0.025, 0.50, 0.975))
(beta_sum_actual <- scale_back (beta_sum))
 2.5%   50% 97.5% 
 3.39  3.73  4.02 
est_0.5 <- beta_sum[2]
est_0.5_actual <- beta_sum_actual[2]

p_w <- ggplot(data=dat1_w, aes(x = loggc2pfu_s)) +
 geom_histogram(aes(y = after_stat(density)), bins = 15, 
                fill = "darkgray", color = "black") +
 geom_line(data=dat1_beta, 
           aes(x = x, y = density),
           color = "#fb6502", linetype = "dashed", linewidth = 0.5) +
 geom_rug(aes(x = loggc2pfu_s), sides = "b", color = "black") +
 geom_vline(xintercept = est_0.5,
             color = "#00529b", linetype = "solid", linewidth = 0.5) +
 annotate("text", x = est_0.5,
          y = Inf, label = paste("Median =", round(est_0.5_actual, 2)),
          vjust = 5, hjust = -0.2,
          color = "#00529b", size = 2.5,
          family = "times") + 
 scale_x_continuous(breaks = x_ticks_scaled, 
                    labels = round(x_ticks_actual, 1)) +
 scale_y_continuous (breaks = c (0, 1, 2, 3, 4, 5),
                    limits = c(0, 5)) +
 labs(
  title = "With RNase",
  x = bquote (Log[10]~GC:PFU~ratio),
  y = "Density") +
 theme_bw (base_family="") +
 theme( 
  text = element_text(family = "Times"),
  legend.position = "inside",
  legend.position.inside = c(0.3, 0.9), 
  legend.direction="horizontal", 
  legend.text=element_text(size = 6, face='bold'),
  legend.title=element_text (size = 6,face='bold'),
  plot.title = element_text(color="black", size=10, face="bold", hjust = 0.5, vjust=0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.box.background=element_rect(fill="transparent", color="transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  legend.key=element_rect(fill="transparent", color="transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color=guide_legend(override.aes=list(fill=NA)))

p_w

####
# W/o RNase
## reading data for w/ and w/o RNase
dat1_wo <- readRDS("pred_GCfromPFU_ave.RDS") |>
 dplyr::filter(rnase=="wo") |>
 as.data.frame()

# scaled value (min-max normalization)
min_loggc2pfu <- min(dat1_wo$loggc2pfu)
max_loggc2pfu <- max(dat1_wo$loggc2pfu)

dat1_wo$loggc2pfu_s <- (10^dat1_wo$loggc2pfu - 10^min_loggc2pfu) / (10^max_loggc2pfu - 10^min_loggc2pfu)

# Convert absolute 0 and 1 values to avoid calculation error
dat1_wo$loggc2pfu_s[dat1_wo$loggc2pfu_s == 0] <- 0.00001
dat1_wo$loggc2pfu_s[dat1_wo$loggc2pfu_s == 1] <- 0.99999

# Beta dist fit: W/o RNase
fit_beta <- fitdist(dat1_wo$loggc2pfu_s, "beta", 
                    start = list(shape1 = 2, shape2 = 2),
                    method = 'mge'); summary (fit_beta)
Warning in fitdist(dat1_wo$loggc2pfu_s, "beta", start = list(shape1 = 2, :
maximum GOF estimation has a default 'gof' argument set to 'CvM'
Fitting of the distribution ' beta ' by maximum goodness-of-fit 
Parameters : 
       estimate
shape1    0.879
shape2    4.286
Loglikelihood:  -20.7   AIC:  45.3   BIC:  47.4 
# Bootstrap simulation of uncertainty
f_beta_boot <- bootdist(fit_beta, bootmethod="param", niter = iters)
summary(f_beta_boot)
Parametric bootstrap medians and 95% percentile CI 
       Median  2.5% 97.5%
shape1  0.919 0.545  2.01
shape2  4.618 2.251 11.88
shape1_wo <- quantile(f_beta_boot$estim[, 1], probs = c(0.5))
shape2_wo <- quantile(f_beta_boot$estim[, 2], probs = c(0.5))

rbeta.est.boot_wo <- rbeta (iters, shape1_wo, shape2_wo)

# Plotting density distributions
data_seq <- seq(0, 1, length = 100)
dbeta2 <- dbeta (data_seq, shape1_wo, shape2_wo)

# Create a data frame for the beta distribution to overlay
dat1_beta <- data.frame(x = data_seq, density = dbeta2)

scale_back <- function(x) {
  x * (max_loggc2pfu - min_loggc2pfu) + min_loggc2pfu
}

x_ticks_scaled <- seq(0, 1, length.out = 6)
x_ticks_actual <- scale_back(x_ticks_scaled)

# Summaries
beta_sum <- quantile(rbeta.est.boot_wo, prob=c(0.025, 0.50, 0.975))
(beta_sum_actual <- scale_back (beta_sum))
 2.5%   50% 97.5% 
 3.59  3.72  4.10 
est_0.5 <- beta_sum[2]
est_0.5_actual <- beta_sum_actual[2]

p_wo <- ggplot(data=dat1_wo, aes(x = loggc2pfu_s)) +
 geom_histogram(aes(y = after_stat(density)), bins = 15, 
                fill = "darkgray", color = "black") +
 geom_line(data=dat1_beta, 
           aes(x = x, y = density),
           color = "#fb6502", linetype = "dashed", linewidth = 0.5) +
 geom_rug(aes(x = loggc2pfu_s), sides = "b", color = "black") +
 geom_vline(xintercept = est_0.5,
             color = "#00529b", linetype = "solid", linewidth = 0.5) +
 annotate("text", x = est_0.5,
          y = Inf, label = paste("Median =", round(est_0.5_actual, 2)),
          vjust = 5, hjust = -0.2,
          color = "#00529b", size = 2.5,
          family = "times") + 
 scale_x_continuous(breaks = x_ticks_scaled, 
                    labels = round(x_ticks_actual, 1)) +
 scale_y_continuous (breaks = c (0, 1, 2, 3, 4, 5),
                    limits = c(0, 5)) +
 labs(
   title = "Without RNase",
       x = bquote (Log[10]~GC:PFU~ratio),
       y = "") +
 theme_bw (base_family="") +
 theme( 
  text = element_text(family = "Times"),
  legend.position = "inside",
  legend.position.inside = c(0.3, 0.9), 
  legend.direction="horizontal", 
  legend.text=element_text(size = 6, face='bold'),
  legend.title=element_text (size = 6,face='bold'),
  plot.title = element_text(color="black", size=10, face="bold", hjust = 0.5, vjust=0.5),
  axis.text.x=element_text(size = 10, color="black", face="bold", hjust = 0.5, vjust = 0.5),
  axis.text.y=element_text (size = 10, color="black", face="bold"),
  axis.title.x = element_text (size = 10, vjust = -1),
  axis.title.y = element_text (size = 10, vjust = 2),
  panel.grid.major = element_line (color="darkgray", linewidth=0.1, linetype="solid"),
  panel.grid.minor = element_line (color="gray", linewidth=0.1, linetype="dashed"),
  panel.background = element_rect (fill =  "transparent"),
  legend.box.background=element_rect(fill="transparent", color="transparent"),
  legend.background = element_rect (fill = "transparent", color = "transparent"),
  legend.key=element_rect(fill="transparent", color="transparent"),
  plot.background = element_rect (fill = "transparent", color = "transparent")) +
 guides(color=guide_legend(override.aes=list(fill=NA)))

combined_plot <- ggpubr::ggarrange(p_w, p_wo, 
                                   ncol = 2,
                                   labels = c("A", "B"),
                                   font.label = list(size=9, face="bold", family="times"
                                   ))
combined_plot

ggsave(plot=combined_plot, file="GC2PFU_beta_dist.tiff", width=18, height=7, units = c("cm"))

# Compare distributions
## Asymptotic two-sample Kolmogorov-Smirnov test
ks_out <- ks.test(rbeta.est.boot_w, rbeta.est.boot_wo, alternative = "two.sided")
ks_out

    Asymptotic two-sample Kolmogorov-Smirnov test

data:  rbeta.est.boot_w and rbeta.est.boot_wo
D = 0.8, p-value <2e-16
alternative hypothesis: two-sided
## Wilcoxon rank sum test: comparing locations
wx_out <- wilcox.test(rbeta.est.boot_w, rbeta.est.boot_wo, alternative = "two.sided", 
                      conf.int = TRUE, conf.level = 0.95)
wx_out

    Wilcoxon rank sum test with continuity correction

data:  rbeta.est.boot_w and rbeta.est.boot_wo
W = 1e+06, p-value <2e-16
alternative hypothesis: true location shift is not equal to 0
95 percent confidence interval:
 0.415 0.445
sample estimates:
difference in location 
                  0.43 
rm(dat1_wo, iters, fit_beta, shape1_wo, shape2_wo, f_beta_boot, dbeta2, data_seq, dat1_beta, rbeta.est.boot_w, rbeta.est.boot_wo, combined_plot, ks_out, wx_out, est_0.5, est_0.5_actual, x_ticks_actual, x_ticks_scaled, scale_back, dat1_w, shape1_w, shape2_w)

PFU over GC:PFU ratio predictive power

PFU ~ GC:PFU

# Read data
dat1 <- readRDS("pred_GCfromPFU_ave.RDS")
dat1 <- dat1[complete.cases(dat1), ]

# Subset w/ and wo RNase
dat1_w <- dat1 |>
 dplyr::filter(rnase=="w") |>
 as.data.frame()

dat1_wo <- dat1 |>
 dplyr::filter(rnase=="wo") |>
 as.data.frame()

# Fitting the model and predict using global models
# W/ Rnase
fit_w <- lm(pfu_rxn ~ loggc2pfu, data = dat1_w); summary(fit_w)

Call:
lm(formula = pfu_rxn ~ loggc2pfu, data = dat1_w)

Residuals:
   Min     1Q Median     3Q    Max 
-3.418 -0.841  0.009  1.130  2.743 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)    22.69       6.01    3.78   0.0013 **
loggc2pfu      -5.62       1.61   -3.48   0.0025 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.64 on 19 degrees of freedom
Multiple R-squared:  0.39,  Adjusted R-squared:  0.358 
F-statistic: 12.1 on 1 and 19 DF,  p-value: 0.00249
sqrt(mean(fit_w$residuals^2))
[1] 1.56
coef(fit_w)
(Intercept)   loggc2pfu 
      22.69       -5.62 
confint(fit_w)
            2.5 % 97.5 %
(Intercept)  10.1  35.27
loggc2pfu    -9.0  -2.24
# W/o Rnase
fit_wo <- lm(pfu_rxn ~ loggc2pfu, data = dat1_wo); summary(fit_wo)

Call:
lm(formula = pfu_rxn ~ loggc2pfu, data = dat1_wo)

Residuals:
   Min     1Q Median     3Q    Max 
-3.847 -1.090  0.429  1.088  2.953 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)    19.66       5.92    3.32   0.0036 **
loggc2pfu      -4.53       1.50   -3.02   0.0070 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.73 on 19 degrees of freedom
Multiple R-squared:  0.325, Adjusted R-squared:  0.289 
F-statistic: 9.14 on 1 and 19 DF,  p-value: 0.00699
sqrt(mean(fit_wo$residuals^2))
[1] 1.64
coef(fit_wo)
(Intercept)   loggc2pfu 
      19.66       -4.53 
confint(fit_wo)
            2.5 % 97.5 %
(Intercept)  7.27   32.0
loggc2pfu   -7.67   -1.4
list(fit_w=fit_w, fit_ws=summary(fit_w),
     fit_wo=fit_wo, fit_wos=summary(fit_wo))|>
 saveRDS("logGC2PFU_to_PFU_model_fits.RDS")

rm(dat1, dat1_w, dat1_wo, fit_w, fit_wo)

PFU ~ log GC:PFU predictions

# read data and model fit
dat1 <- readRDS("pred_GCfromPFU_ave.RDS")
dat1 <- dat1[complete.cases(dat1), ]

model_fits <- readRDS("logGC2PFU_to_PFU_model_fits.RDS")

# Create data sequence
data_seq <- data.frame(loggc2pfu=seq(min(dat1$loggc2pfu), max(dat1$loggc2pfu), length.out = 50))

## W/ RNase
pred_fit_w <- predict(model_fits$fit_w, 
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_w = lwr,
        pred_upr_w = upr,
        pred_fit_w = fit)

ci_fit_w <- predict(model_fits$fit_w, 
                     newdata = data_seq,
                     interval = "confidence", level = 0.95)  |>
 as.data.frame() |> 
 rename(ci_lwr_w = lwr,
        ci_upr_w = upr,
        ci_fit_w = fit)

## W/o RNase
pred_fit_wo <- predict(model_fits$fit_wo, 
                     newdata = data_seq,
                     interval = "prediction", level = 0.95) |>
 as.data.frame() |> 
 rename(pred_lwr_wo = lwr,
        pred_upr_wo = upr,
        pred_fit_wo = fit)

ci_fit_wo <- predict(model_fits$fit_wo, 
                     newdata = data_seq,
                     interval = "confidence", level = 0.95)  |>
 as.data.frame() |> 
 rename(ci_lwr_wo = lwr,
        ci_upr_wo = upr,
        ci_fit_wo = fit)

merged_data <- cbind(data_seq, pred_fit_w, ci_fit_w, pred_fit_wo, ci_fit_w)

merged_data$pred_dif_wo_w <- merged_data$pred_fit_wo - merged_data$pred_fit_w

saveRDS(merged_data, "logGC2PFU_to_PFU_model_prediciton.RDS")

rm(dat1, data_seq, model_fits, ci_fit_w, ci_fit_wo, pred_fit_wo, pred_fit_w, merged_data)

Models agreement analyses and visualizations

# Bland-Altman analysis (Tukey mean-difference)
dat1 <- readRDS("logGC2PFU_to_PFU_model_prediciton.RDS")

# W/ RNase
blandr.statistics (dat1$pred_fit_wo, dat1$pred_fit_w, sig.level=0.95)
Bland-Altman Statistics
=======================
t = 20, df = 49, p-value = <2e-16
alternative hypothesis: true bias is not equal to 0

=======================
Number of comparisons:  50 
Maximum value for average measures:  5.13 
Minimum value for average measures:  -1.94 
Maximum value for difference in measures:  1.92 
Minimum value for difference in measures:  0.408 

Bias:  1.17 
Standard deviation of bias:  0.451 

Standard error of bias:  0.0638 
Standard error for limits of agreement:  0.11 

Bias:  1.17 
Bias- upper 95% CI:  1.29 
Bias- lower 95% CI:  1.04 

Upper limit of agreement:  2.05 
Upper LOA- upper 95% CI:  2.27 
Upper LOA- lower 95% CI:  1.83 

Lower limit of agreement:  0.282 
Lower LOA- upper 95% CI:  0.502 
Lower LOA- lower 95% CI:  0.061 

=======================
Derived measures:  
Mean of differences/means:  -13 
Point estimate of bias as proportion of lowest average:  -60.1 
Point estimate of bias as proportion of highest average 22.7 
Spread of data between lower and upper LoAs:  1.77 
Bias as proportion of LoA spread:  65.9 

=======================
Bias: 
 1.17  ( 1.04  to  1.29 ) 
ULoA: 
 2.05  ( 1.83  to  2.27 ) 
LLoA: 
 0.282  ( 0.061  to  0.502 ) 
# Difference between W/O over W/ RNase treatment
mean(dat1$pred_dif_wo_w)
[1] 1.17
sd(dat1$pred_dif_wo_w)/sqrt(length(dat1))
[1] 0.121
mean(10^dat1$pred_dif_wo_w)
[1] 23.7
sd(10^dat1$pred_dif_wo_w)/sqrt(length(dat1))
[1] 6.02
## check the assumption of normality for the differences
shapiro.test(dat1$pred_dif_wo_w)

    Shapiro-Wilk normality test

data:  dat1$pred_dif_wo_w
W = 1, p-value = 0.06
## t-test
t.test(
 dat1$pred_fit_wo, dat1$pred_fit_w, 
 alternative = c("two.sided"),
 paired = TRUE)

    Paired t-test

data:  dat1$pred_fit_wo and dat1$pred_fit_w
t = 18, df = 49, p-value <2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 1.04 1.29
sample estimates:
mean difference 
           1.17 

Model validation through train data (using original data)

dat1 <- readRDS("pred_GCfromPFU_ave.RDS")
dat1 <- dat1[complete.cases(dat1), ]

# Initialize a list to store the RMSE results
rmse_results <- list()
r2_results <- list()
train_data_list <- list()
test_data_list <- list()

# Train data w/ RNase
dat1_w <- dat1 |>
  filter(rnase=="w")

index <- dat1_w$loggc2pfu |>
  createDataPartition(p = 0.70, times = 5, list = TRUE)

# Loop over each partition and check for data size
for (i in 1:length(index)) {
  train_data <- dat1_w[index[[i]], ]
  test_data <- dat1_w[-index[[i]], ]
  
  # Check if both train and test sets have more than one row
  if (nrow(train_data) > 1 & nrow(test_data) > 1) {
    # Train the model on train_data
    model <- train(pfu_rxn ~ loggc2pfu, 
                   data = train_data, 
                   method = "lm", 
                   trControl = trainControl(method = "boot", number = 500))
    
    # Predict on test data
    pred <- predict(model, newdata = test_data)
    
    # Calculate RMSE
    rmse <- caret::RMSE(pred = pred, obs = test_data$pfu_rxn)
    r2 <- caret::R2(pred = pred, obs = test_data$pfu_rxn)    
    
    # Store the result
    rmse_results[[i]] <- rmse
    r2_results[[i]] <- r2
    train_data_list[[i]] <- train_data
    test_data_list[[i]] <- test_data 
    
  } else {
    # If either train or test set is too small, skip this iteration
    cat("Skipping iteration", i, "due to small data size\n")
  }
}
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
# Check if any valid RMSE results were obtained
rmse_results; summary(unlist(rmse_results))
[[1]]
[1] 1.35

[[2]]
[1] 1.24

[[3]]
[1] 1.06

[[4]]
[1] 1.83

[[5]]
[1] 1.17
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.06    1.17    1.24    1.33    1.35    1.83 
r2_results; summary(unlist(r2_results))
[[1]]
[1] 0.507

[[2]]
[1] 0.283

[[3]]
[1] 0.834

[[4]]
[1] 0.622

[[5]]
[1] 0.933
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.283   0.507   0.622   0.636   0.834   0.933 
train_data_list[1]
[[1]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
1    0     4.8     w   1     8.37     8.09     8.64   8.37   8.30   8.43
2    0     4.8     w   2     7.96     7.69     8.23   7.96   7.90   8.02
3    0     4.8     w   3     8.47     8.19     8.75   8.47   8.40   8.54
4    1     3.8     w   1     7.17     6.90     7.44   7.17   7.12   7.21
5    1     3.8     w   2     7.54     7.26     7.81   7.54   7.49   7.59
7    2     2.8     w   1     6.52     6.25     6.79   6.52   6.48   6.56
8    2     2.8     w   2     6.59     6.32     6.86   6.59   6.55   6.63
9    2     2.8     w   3     6.54     6.27     6.81   6.54   6.50   6.58
10   3     1.8     w   1     5.52     5.25     5.79   5.52   5.48   5.55
12   3     1.8     w   3     5.53     5.26     5.80   5.53   5.49   5.56
13   4     0.8     w   1     4.35     4.08     4.62   4.35   4.30   4.39
15   4     0.8     w   3     4.83     4.56     5.10   4.83   4.79   4.87
16   5    -0.2     w   1     3.72     3.45     4.00   3.72   3.67   3.78
17   5    -0.2     w   2     3.60     3.33     3.88   3.60   3.55   3.66
18   5    -0.2     w   3     3.83     3.55     4.10   3.83   3.77   3.88
19   6    -1.2     w   1     2.44     2.16     2.72   2.44   2.37   2.52
20   6    -1.2     w   2     2.68     2.41     2.96   2.68   2.61   2.76
   loggc2pfu gc2pfu
1       3.57   3691
2       3.16   1445
3       3.67   4680
4       3.37   2332
5       3.74   5453
7       3.72   5293
8       3.79   6112
9       3.74   5482
10      3.72   5219
12      3.73   5346
13      3.55   3545
15      4.03  10746
16      3.92   8357
17      3.80   6358
18      4.03  10630
19      3.64   4381
20      3.88   7668
test_data_list
[[1]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
6    1     3.8     w   3     7.46     7.19     7.74   7.46   7.42   7.51
11   3     1.8     w   2     5.20     4.93     5.47   5.20   5.16   5.24
14   4     0.8     w   2     4.60     4.33     4.88   4.60   4.56   4.65
21   6    -1.2     w   3     2.91     2.63     3.19   2.91   2.84   2.98
   loggc2pfu gc2pfu
6       3.66   4623
11      3.40   2506
14      3.80   6376
21      4.11  12911

[[2]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
10   3     1.8     w   1     5.52     5.25     5.79   5.52   5.48   5.55
13   4     0.8     w   1     4.35     4.08     4.62   4.35   4.30   4.39
14   4     0.8     w   2     4.60     4.33     4.88   4.60   4.56   4.65
16   5    -0.2     w   1     3.72     3.45     4.00   3.72   3.67   3.78
   loggc2pfu gc2pfu
10      3.72   5219
13      3.55   3545
14      3.80   6376
16      3.92   8357

[[3]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
4    1     3.8     w   1     7.17     6.90     7.44   7.17   7.12   7.21
6    1     3.8     w   3     7.46     7.19     7.74   7.46   7.42   7.51
14   4     0.8     w   2     4.60     4.33     4.88   4.60   4.56   4.65
21   6    -1.2     w   3     2.91     2.63     3.19   2.91   2.84   2.98
   loggc2pfu gc2pfu
4       3.37   2332
6       3.66   4623
14      3.80   6376
21      4.11  12911

[[4]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
10   3     1.8     w   1     5.52     5.25     5.79   5.52   5.48   5.55
11   3     1.8     w   2     5.20     4.93     5.47   5.20   5.16   5.24
17   5    -0.2     w   2     3.60     3.33     3.88   3.60   3.55   3.66
20   6    -1.2     w   2     2.68     2.41     2.96   2.68   2.61   2.76
   loggc2pfu gc2pfu
10      3.72   5219
11      3.40   2506
17      3.80   6358
20      3.88   7668

[[5]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
2    0     4.8     w   2     7.96     7.69     8.23   7.96   7.90   8.02
7    2     2.8     w   1     6.52     6.25     6.79   6.52   6.48   6.56
8    2     2.8     w   2     6.59     6.32     6.86   6.59   6.55   6.63
15   4     0.8     w   3     4.83     4.56     5.10   4.83   4.79   4.87
   loggc2pfu gc2pfu
2       3.16   1445
7       3.72   5293
8       3.79   6112
15      4.03  10746
rm(dat1_w, r2_results, rmse_results, test_data, train_data)

######
# W/o RNase
# Train data w/ RNase
dat1_wo <- dat1 |>
  filter(rnase=="wo")

rmse_results <- list()
r2_results <- list()
train_data_list <- list()
test_data_list <- list()

index <- dat1_wo$loggc2pfu |>
  createDataPartition(p = 0.70, times = 5, list = TRUE)

# Loop over each partition and check for data size
for (i in 1:length(index)) {
  train_data <- dat1_wo[index[[i]], ]
  test_data <- dat1_wo[-index[[i]], ]
  
  # Check if both train and test sets have more than one row
  if (nrow(train_data) > 1 & nrow(test_data) > 1) {
    # Train the model on train_data
    model <- train(pfu_rxn ~ loggc2pfu, 
                   data = train_data, 
                   method = "lm", 
                   trControl = trainControl(method = "boot", number = 500))
    
    # Predict on test data
    pred <- predict(model, newdata = test_data)
    
    # Calculate RMSE
    rmse <- caret::RMSE(pred = pred, obs = test_data$pfu_rxn)
    r2 <- caret::R2(pred = pred, obs = test_data$pfu_rxn)    
    
    # Store the result
    rmse_results[[i]] <- rmse
    r2_results[[i]] <- r2
    train_data_list[[i]] <- train_data
    test_data_list[[i]] <- test_data 
    
  } else {
    # If either train or test set is too small, skip this iteration
    cat("Skipping iteration", i, "due to small data size\n")
  }
}

# Check if any valid RMSE results were obtained
rmse_results; summary(unlist(rmse_results))
[[1]]
[1] 1.63

[[2]]
[1] 2.1

[[3]]
[1] 1.71

[[4]]
[1] 1.76

[[5]]
[1] 0.956
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.956   1.632   1.705   1.631   1.764   2.100 
r2_results; summary(unlist(r2_results))
[[1]]
[1] 0.626

[[2]]
[1] 0.32

[[3]]
[1] 0.86

[[4]]
[1] 0.0578

[[5]]
[1] 0.98
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.058   0.320   0.626   0.569   0.860   0.980 
train_data_list[1]
[[1]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
1    0     4.8    wo   1     8.38     8.11     8.66   8.38   8.32   8.45
2    0     4.8    wo   2     8.57     8.30     8.85   8.57   8.51   8.64
3    0     4.8    wo   3     8.73     8.45     9.01   8.73   8.66   8.80
5    1     3.8    wo   2     7.65     7.38     7.92   7.65   7.60   7.70
6    1     3.8    wo   3     7.49     7.22     7.76   7.49   7.44   7.54
7    2     2.8    wo   1     6.61     6.34     6.88   6.61   6.57   6.65
8    2     2.8    wo   2     6.69     6.42     6.97   6.69   6.65   6.74
9    2     2.8    wo   3     6.83     6.55     7.10   6.83   6.78   6.87
10   3     1.8    wo   1     5.46     5.19     5.73   5.46   5.43   5.50
11   3     1.8    wo   2     5.70     5.43     5.98   5.70   5.67   5.74
12   3     1.8    wo   3     5.96     5.69     6.23   5.96   5.92   5.99
13   4     0.8    wo   1     4.45     4.18     4.72   4.45   4.41   4.50
14   4     0.8    wo   2     5.09     4.82     5.36   5.09   5.05   5.12
17   5    -0.2    wo   2     3.80     3.52     4.07   3.80   3.74   3.85
18   5    -0.2    wo   3     4.08     3.80     4.35   4.08   4.03   4.13
19   6    -1.2    wo   1     2.55     2.27     2.83   2.55   2.48   2.63
21   6    -1.2    wo   3     3.35     3.08     3.63   3.35   3.29   3.41
   loggc2pfu gc2pfu
1       3.58   3835
2       3.77   5950
3       3.93   8490
5       3.85   7105
6       3.69   4905
7       3.81   6507
8       3.89   7839
9       4.03  10597
10      3.66   4594
11      3.90   8029
12      4.16  14379
13      3.65   4491
14      4.29  19324
17      4.00   9890
18      4.28  18913
19      3.75   5656
21      4.55  35655
test_data_list
[[1]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
4    1     3.8    wo   1     7.54     7.27     7.81   7.54   7.49   7.59
15   4     0.8    wo   3     4.76     4.49     5.03   4.76   4.72   4.80
16   5    -0.2    wo   1     3.67     3.40     3.94   3.67   3.61   3.72
20   6    -1.2    wo   2     3.16     2.89     3.44   3.16   3.10   3.22
   loggc2pfu gc2pfu
4       3.74   5473
15      3.96   9049
16      3.87   7402
20      4.36  22942

[[2]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
3    0     4.8    wo   3     8.73     8.45     9.01   8.73   8.66   8.80
5    1     3.8    wo   2     7.65     7.38     7.92   7.65   7.60   7.70
13   4     0.8    wo   1     4.45     4.18     4.72   4.45   4.41   4.50
21   6    -1.2    wo   3     3.35     3.08     3.63   3.35   3.29   3.41
   loggc2pfu gc2pfu
3       3.93   8490
5       3.85   7105
13      3.65   4491
21      4.55  35655

[[3]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
2    0     4.8    wo   2     8.57     8.30     8.85   8.57   8.51   8.64
6    1     3.8    wo   3     7.49     7.22     7.76   7.49   7.44   7.54
9    2     2.8    wo   3     6.83     6.55     7.10   6.83   6.78   6.87
14   4     0.8    wo   2     5.09     4.82     5.36   5.09   5.05   5.12
   loggc2pfu gc2pfu
2       3.77   5950
6       3.69   4905
9       4.03  10597
14      4.29  19324

[[4]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
5    1     3.8    wo   2     7.65     7.38     7.92   7.65   7.60   7.70
13   4     0.8    wo   1     4.45     4.18     4.72   4.45   4.41   4.50
14   4     0.8    wo   2     5.09     4.82     5.36   5.09   5.05   5.12
17   5    -0.2    wo   2     3.80     3.52     4.07   3.80   3.74   3.85
   loggc2pfu gc2pfu
5       3.85   7105
13      3.65   4491
14      4.29  19324
17      4.00   9890

[[5]]
   dil pfu_rxn rnase rep pred_fit pred_lwr pred_upr ci_fit ci_lwr ci_upr
4    1     3.8    wo   1     7.54     7.27     7.81   7.54   7.49   7.59
7    2     2.8    wo   1     6.61     6.34     6.88   6.61   6.57   6.65
11   3     1.8    wo   2     5.70     5.43     5.98   5.70   5.67   5.74
20   6    -1.2    wo   2     3.16     2.89     3.44   3.16   3.10   3.22
   loggc2pfu gc2pfu
4       3.74   5473
7       3.81   6507
11      3.90   8029
20      4.36  22942